crime = read.csv("/Users/amoghramagiri/Documents/mid_term_project/pro/Crime_Data_from_2020_to_Present.csv")
head(crime)
nrow(crime)
ncol(crime)
The dataset contains 28 columns and 986500 rows.
na_count <- colSums(is.na(crime))
print("\nNA Count per Column:")
## [1] "\nNA Count per Column:"
print(na_count)
## DR_NO Date.Rptd DATE.OCC TIME.OCC AREA
## 0 0 0 0 0
## AREA.NAME Rpt.Dist.No Part.1.2 Crm.Cd Crm.Cd.Desc
## 0 0 0 0 0
## Mocodes Vict.Age Vict.Sex Vict.Descent Premis.Cd
## 0 0 0 0 14
## Premis.Desc Weapon.Used.Cd Weapon.Desc Status Status.Desc
## 0 660132 0 0 0
## Crm.Cd.1 Crm.Cd.2 Crm.Cd.3 Crm.Cd.4 LOCATION
## 11 917588 984190 986436 0
## Cross.Street LAT LON
## 0 0 0
new_crime <- crime[, colSums(is.na(crime)) == 0]
head(new_crime)
## DR_NO Date.Rptd DATE.OCC TIME.OCC AREA
## 1 190326475 03/01/2020 12:00:00 AM 03/01/2020 12:00:00 AM 2130 7
## 2 200106753 02/09/2020 12:00:00 AM 02/08/2020 12:00:00 AM 1800 1
## 3 200320258 11/11/2020 12:00:00 AM 11/04/2020 12:00:00 AM 1700 3
## 4 200907217 05/10/2023 12:00:00 AM 03/10/2020 12:00:00 AM 2037 9
## 5 220614831 08/18/2022 12:00:00 AM 08/17/2020 12:00:00 AM 1200 6
## 6 231808869 04/04/2023 12:00:00 AM 12/01/2020 12:00:00 AM 2300 18
## AREA.NAME Rpt.Dist.No Part.1.2 Crm.Cd
## 1 Wilshire 784 1 510
## 2 Central 182 1 330
## 3 Southwest 356 1 480
## 4 Van Nuys 964 1 343
## 5 Hollywood 666 2 354
## 6 Southeast 1826 2 354
## Crm.Cd.Desc Mocodes Vict.Age
## 1 VEHICLE - STOLEN 0
## 2 BURGLARY FROM VEHICLE 1822 1402 0344 47
## 3 BIKE - STOLEN 0344 1251 19
## 4 SHOPLIFTING-GRAND THEFT ($950.01 & OVER) 0325 1501 19
## 5 THEFT OF IDENTITY 1822 1501 0930 2004 28
## 6 THEFT OF IDENTITY 1822 0100 0930 0929 41
## Vict.Sex Vict.Descent Premis.Desc
## 1 M O STREET
## 2 M O BUS STOP/LAYOVER (ALSO QUERY 124)
## 3 X X MULTI-UNIT DWELLING (APARTMENT, DUPLEX, ETC)
## 4 M O CLOTHING STORE
## 5 M H SIDEWALK
## 6 M H SINGLE FAMILY DWELLING
## Weapon.Desc Status Status.Desc LOCATION
## 1 AA Adult Arrest 1900 S LONGWOOD AV
## 2 IC Invest Cont 1000 S FLOWER ST
## 3 IC Invest Cont 1400 W 37TH ST
## 4 IC Invest Cont 14000 RIVERSIDE DR
## 5 IC Invest Cont 1900 TRANSIENT
## 6 IC Invest Cont 9900 COMPTON AV
## Cross.Street LAT LON
## 1 34.0 -118
## 2 34.0 -118
## 3 34.0 -118
## 4 34.2 -118
## 5 34.1 -118
## 6 33.9 -118
nrow(new_crime)
## [1] 986500
ncol(new_crime)
## [1] 22
# Feature Selection based on Smart Questions
cols_to_remove <- c(
"Mocodes", "Rpt.Dist.No", "Part.1.2",
"Premis_cd","Premis.Desc", "Status", "Status.Desc","Cross.Street"
)
# Drop the specified columns
crime_data <- new_crime[, !(names(new_crime) %in% cols_to_remove)]
print("Data after removing unnecessary columns:")
## [1] "Data after removing unnecessary columns:"
print(names(crime_data))
## [1] "DR_NO" "Date.Rptd" "DATE.OCC" "TIME.OCC" "AREA"
## [6] "AREA.NAME" "Crm.Cd" "Crm.Cd.Desc" "Vict.Age" "Vict.Sex"
## [11] "Vict.Descent" "Weapon.Desc" "LOCATION" "LAT" "LON"
How have the top three most common crimes from 2020 to the present been distributed across the top five areas where they are most frequently committed in Los Angeles, and are these trends increasing or decreasing in each area from 2020 to 2024?
library(dplyr)
library(ggplot2)
library(lubridate)
# Ensure date is in correct format
crime_data$DATE.OCC <- as.Date(crime_data$DATE.OCC, format = "%m/%d/%Y")
# Identify top 3 crimes overall from 2020 to present
top_crimes <- crime_data %>%
filter(year(DATE.OCC) >= 2020 & year(DATE.OCC) <= 2023 ) %>%
group_by(Crm.Cd, Crm.Cd.Desc) %>%
summarise(count = n()) %>%
arrange(desc(count)) %>%
head(3)
crime_summary <- crime_data %>%
filter(year(DATE.OCC) >= 2020) %>%
group_by(AREA.NAME) %>%
summarise(
total_crime = n(),
top3_crime = sum(Crm.Cd %in% top_crimes$Crm.Cd)
) %>%
mutate(top3_pct = top3_crime / total_crime * 100)
top_areas <- crime_summary %>%
arrange(desc(top3_crime)) %>%
head(5)
# Filter data for top 3 crimes from 2020 onwards
crime_filtered <- crime_data %>%
filter(year(DATE.OCC) >= 2020, Crm.Cd %in% top_crimes$Crm.Cd)
# Create separate plots with angled x-axis labels
p1 <- ggplot(
subset(crime_filtered, Crm.Cd.Desc == "BATTERY - SIMPLE ASSAULT"),
aes(x = AREA.NAME)
) +
geom_bar(fill = "#4e79a7") +
theme_minimal() +
labs(
title = "BATTERY - SIMPLE ASSAULT Distribution by Area",
x = "Area Name",
y = "Number of Incidents"
) +
theme(
plot.title = element_text(size = 14, face = "bold"),
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1), # Angled labels
panel.grid.minor = element_blank(),
plot.margin = margin(20, 20, 40, 20) # Increased bottom margin for labels
)
p2 <- ggplot(
subset(crime_filtered, Crm.Cd.Desc == "THEFT OF IDENTITY"),
aes(x = AREA.NAME)
) +
geom_bar(fill = "#3d8f7d") +
theme_minimal() +
labs(
title = "THEFT OF IDENTITY Distribution by Area",
x = "Area Name",
y = "Number of Incidents"
) +
theme(
plot.title = element_text(size = 14, face = "bold"),
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1), # Angled labels
panel.grid.minor = element_blank(),
plot.margin = margin(20, 20, 40, 20) # Increased bottom margin for labels
)
p3 <- ggplot(
subset(crime_filtered, Crm.Cd.Desc == "VEHICLE - STOLEN"),
aes(x = AREA.NAME)
) +
geom_bar(fill = "#59a14f") +
theme_minimal() +
labs(
title = "VEHICLE - STOLEN Distribution by Area",
x = "Area Name",
y = "Number of Incidents"
) +
theme(
plot.title = element_text(size = 14, face = "bold"),
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1), # Angled labels
panel.grid.minor = element_blank(),
plot.margin = margin(20, 20, 40, 20) # Increased bottom margin for labels
)
# Display plots one after another
print(p1)
print(p2)
print(p3)
Top 3
Crimes:
BATTERY - SIMPLE ASSAULT THEFT OF IDENTITY VEHICLE - STOLEN
# 1. Create top_crimes (2020-2023)
top_crimes <- crime_data %>%
filter(year(DATE.OCC) >= 2020 & year(DATE.OCC) <= 2023) %>%
group_by(Crm.Cd, Crm.Cd.Desc) %>%
summarise(count = n()) %>%
arrange(desc(count)) %>%
head(3)
# 2. Create top_areas (2020-2023)
top_areas <- crime_data %>%
filter(year(DATE.OCC) >= 2020 & year(DATE.OCC) <= 2023) %>%
group_by(AREA.NAME) %>%
summarise(count = n()) %>%
arrange(desc(count)) %>%
head(5)
# 3. Create crime_counts with verified column names
crime_counts <- crime_data %>%
filter(year(DATE.OCC) >= 2020 & year(DATE.OCC) <= 2023,
AREA.NAME %in% top_areas$AREA.NAME,
Crm.Cd %in% top_crimes$Crm.Cd) %>%
group_by(AREA.NAME, Year = year(DATE.OCC), Crm.Cd.Desc) %>%
summarise(crime_count = n(), .groups = 'drop')
# 4. Trend analysis
trend_analysis <- crime_counts %>%
group_by(AREA.NAME, Crm.Cd.Desc) %>%
summarise(
p_value = summary(lm(crime_count ~ Year))$coefficients[2,4],
r_squared = summary(lm(crime_count ~ Year))$r.squared,
.groups = 'drop'
)
# 5. Visualization
p1 <- ggplot(crime_counts,
aes(x = Year, y = crime_count, color = AREA.NAME)) +
geom_line() +
geom_point() +
facet_wrap(~Crm.Cd.Desc, scales = "free_y") +
theme_minimal() +
labs(title = "Crime Trends by Type and Area (2020-2023)",
x = "Year",
y = "Crime Count")
# Print results
print("\nTrend Analysis Results:")
## [1] "\nTrend Analysis Results:"
print(trend_analysis)
## # A tibble: 15 × 4
## AREA.NAME Crm.Cd.Desc p_value r_squared
## <chr> <chr> <dbl> <dbl>
## 1 77th Street BATTERY - SIMPLE ASSAULT 0.0991 0.812
## 2 77th Street THEFT OF IDENTITY 0.460 0.291
## 3 77th Street VEHICLE - STOLEN 0.0374 0.927
## 4 Central BATTERY - SIMPLE ASSAULT 0.0624 0.879
## 5 Central THEFT OF IDENTITY 0.399 0.361
## 6 Central VEHICLE - STOLEN 0.0333 0.935
## 7 Hollywood BATTERY - SIMPLE ASSAULT 0.165 0.698
## 8 Hollywood THEFT OF IDENTITY 0.697 0.0916
## 9 Hollywood VEHICLE - STOLEN 0.597 0.163
## 10 Pacific BATTERY - SIMPLE ASSAULT 0.260 0.548
## 11 Pacific THEFT OF IDENTITY 0.221 0.608
## 12 Pacific VEHICLE - STOLEN 0.171 0.687
## 13 Southwest BATTERY - SIMPLE ASSAULT 0.970 0.000872
## 14 Southwest THEFT OF IDENTITY 0.541 0.211
## 15 Southwest VEHICLE - STOLEN 0.00149 0.997
print(p1)
1.
Strong Evidence of Trends (p < 0.05): # - Vehicle theft shows
significant patterns in three areas (Southwest p=0.00149, Central
p=0.03326, 77th Street p=0.03744) # - These trends are highly reliable
with R² > 0.92, suggesting strong predictability
Notable but Not Significant: # - Battery cases show strong patterns (R² > 0.81) but fall just short of significance (p ≈ 0.06-0.10) # - Pacific area shows consistent moderate patterns (R² > 0.54) across all crime types
Areas Needing Further Investigation: # - Theft of Identity shows weak patterns across most areas (R² < 0.36) # - Southwest Battery shows no clear trend (R² = 0.001, p = 0.97)
library(dplyr)
library(ggplot2)
library(lubridate)
# Date formatting
crime_data$DATE.OCC <- as.Date(crime_data$DATE.OCC, format = "%m/%d/%Y")
# top 3 crimes in top 5 areas
crime_counts <- crime_data %>%
filter(year(DATE.OCC) >= 2020 & year(DATE.OCC) <= 2023,
AREA.NAME %in% top_areas$AREA.NAME,
Crm.Cd %in% top_crimes$Crm.Cd) %>%
group_by(AREA.NAME, Year = year(DATE.OCC), Crm.Cd.Desc) %>%
summarise(crime_count = n()) %>%
ungroup()
# Percent change from 2020 benchmark
crime_change <- crime_counts %>%
group_by(AREA.NAME, Crm.Cd.Desc) %>%
mutate(
benchmark_2020 = crime_count[Year == 2020],
percent_change = (crime_count - benchmark_2020) / benchmark_2020 * 100
) %>%
ungroup()
# line graph showing percent change from 2020 benchmark for each area
p <- ggplot(crime_change, aes(x = Year, y = percent_change, color = Crm.Cd.Desc, group = Crm.Cd.Desc)) +
geom_line() +
geom_point() +
geom_hline(yintercept = 0, linetype = "dashed", color = "gray") +
facet_wrap(~ AREA.NAME, ncol = 2) +
theme_minimal() +
labs(title = "Percent Change in Top 3 Crimes from 2020 Benchmark - Top 5 Areas",
x = "Year",
y = "Percent Change from 2020",
color = "Crime Type") +
theme(legend.position = "bottom",
plot.title = element_text(hjust = 0.5, size = 10)) +
scale_y_continuous(labels = scales::percent_format(accuracy = 1))
print(p)
print(crime_change)
## # A tibble: 60 × 6
## AREA.NAME Year Crm.Cd.Desc crime_count benchmark_2020 percent_change
## <chr> <dbl> <chr> <int> <int> <dbl>
## 1 77th Street 2020 BATTERY - SIMPLE… 985 985 0
## 2 77th Street 2020 THEFT OF IDENTITY 446 446 0
## 3 77th Street 2020 VEHICLE - STOLEN 1589 1589 0
## 4 77th Street 2021 BATTERY - SIMPLE… 1012 985 2.74
## 5 77th Street 2021 THEFT OF IDENTITY 542 446 21.5
## 6 77th Street 2021 VEHICLE - STOLEN 1774 1589 11.6
## 7 77th Street 2022 BATTERY - SIMPLE… 1050 985 6.60
## 8 77th Street 2022 THEFT OF IDENTITY 2039 446 357.
## 9 77th Street 2022 VEHICLE - STOLEN 1804 1589 13.5
## 10 77th Street 2023 BATTERY - SIMPLE… 1236 985 25.5
## # ℹ 50 more rows
YEAR-OVER-YEAR CRIME TREND ANALYSIS (2020-2023)
Battery - Simple Assault: # - Central shows highest spikes in 2021-2022 (~25%) # - Hollywood shows initial increase then decline # - Pacific maintains moderate, stable increases # - 77th Street shows fluctuating pattern
Theft of Identity: # - 77th Street shows dramatic spike in 2022 (~250%) # - Southwest shows significant increase in 2022 (~200%) # - All areas show sharp decline in 2023 # - Central shows more moderate increases
Vehicle - Stolen: # - Central shows consistent high increases (40% in 2021-2022) # - Hollywood shows decline in 2023 # - Pacific and Southwest show moderate increases # - More stable patterns than other crime types
Overall Patterns: # - 2022 was peak year for most crimes # - 2023 shows general declining trend # - Central area shows highest volatility # - Different crime types show distinct patterns by area
# Load necessary libraries
library(dplyr)
library(ggplot2)
# 1. Summary Statistics
summary_stats <- crime_counts %>%
group_by(AREA.NAME, Crm.Cd.Desc) %>%
summarise(
mean_count = mean(crime_count, na.rm = TRUE),
sd_count = sd(crime_count, na.rm = TRUE),
min_count = min(crime_count, na.rm = TRUE),
max_count = max(crime_count, na.rm = TRUE),
.groups = 'drop' # This is important to avoid grouped data frame issues
)
# 2. ANOVA test (corrected)
anova_test <- aov(crime_count ~ AREA.NAME + Crm.Cd.Desc, data = crime_counts)
anova_summary <- summary(anova_test)
# 3. Linear Regression for Trend Analysis
trend_analysis <- crime_counts %>%
group_by(AREA.NAME, Crm.Cd.Desc) %>%
do(model = lm(crime_count ~ Year, data = .))
# 4. Calculate year-over-year changes
yoy_changes <- crime_counts %>%
group_by(AREA.NAME, Crm.Cd.Desc) %>%
arrange(Year) %>%
mutate(
prev_year_count = lag(crime_count),
pct_change = (crime_count - prev_year_count) / prev_year_count * 100
) %>%
ungroup() # Ungroup to avoid issues in plotting
# Create the visualization
ggplot(yoy_changes,
aes(x = as.factor(Year),
y = pct_change,
fill = AREA.NAME)) +
geom_bar(stat = "identity", position = "dodge", width = 0.7) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
facet_wrap(~Crm.Cd.Desc, scales = "free_y") +
theme_minimal() +
scale_fill_brewer(palette = "Set2") +
labs(title = "Year-over-Year Percentage Change in Crime Counts",
subtitle = "By Area and Crime Type (2020-2023)",
x = "Year",
y = "Percentage Change (%)",
fill = "Area") +
theme(
plot.title = element_text(hjust = 0.5, size = 12),
plot.subtitle = element_text(hjust = 0.5, size = 10),
axis.text = element_text(size = 10),
legend.position = "bottom",
strip.text = element_text(size = 10, face = "bold")
)
# Print results
print("Summary Statistics:")
## [1] "Summary Statistics:"
print(summary_stats)
## # A tibble: 15 × 6
## AREA.NAME Crm.Cd.Desc mean_count sd_count min_count max_count
## <chr> <chr> <dbl> <dbl> <int> <int>
## 1 77th Street BATTERY - SIMPLE ASSAULT 1071. 113. 985 1236
## 2 77th Street THEFT OF IDENTITY 998. 730. 446 2039
## 3 77th Street VEHICLE - STOLEN 1805 191. 1589 2053
## 4 Central BATTERY - SIMPLE ASSAULT 1608. 254. 1340 1829
## 5 Central THEFT OF IDENTITY 443. 216. 248 744
## 6 Central VEHICLE - STOLEN 970. 310. 594 1247
## 7 Hollywood BATTERY - SIMPLE ASSAULT 1048 129. 879 1181
## 8 Hollywood THEFT OF IDENTITY 590. 113. 478 735
## 9 Hollywood VEHICLE - STOLEN 827 90.9 709 901
## 10 Pacific BATTERY - SIMPLE ASSAULT 767 40.5 734 826
## 11 Pacific THEFT OF IDENTITY 629. 126. 464 766
## 12 Pacific VEHICLE - STOLEN 1345 174. 1086 1453
## 13 Southwest BATTERY - SIMPLE ASSAULT 1034. 43.7 972 1074
## 14 Southwest THEFT OF IDENTITY 862. 567. 473 1692
## 15 Southwest VEHICLE - STOLEN 1334. 220. 1068 1586
print("\nANOVA Results:")
## [1] "\nANOVA Results:"
print(anova_summary)
## Df Sum Sq Mean Sq F value Pr(>F)
## AREA.NAME 4 1530409 382602 2.97 0.028 *
## Crm.Cd.Desc 2 3252930 1626465 12.62 3.3e-05 ***
## Residuals 53 6833036 128925
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Year-over-Year Trends by Crime Type: # Battery - Simple Assault: # - Peak increases in 2021-2022 # - Central area showed highest variability # - Hollywood shows declining trend in 2023
# Theft of Identity: # - Dramatic spike in 2022 (>200% in some areas) # - Consistent decline in 2023 across all areas # - 77th Street shows most stable pattern
# Vehicle - Stolen: # - Central area shows persistent increase # - Most areas peak in 2021-2022 # - Varying patterns of decline in 2023
Key Findings for Policy: # - Crime type is stronger predictor than location # - 2022 was peak year for most crime types # - Different crimes show distinct geographic patterns
Which neighborhoods in Los Angeles have experienced the most significant increases in crime rates from 2020 to 2023 with particular attention to demographic factors (race and sex of victims), and what insights can be drawn from Area3’s significant increase in crimes?
library(tidyr) # Load tidyr for pivot_wider function
library(knitr) # Load knitr for kable function
# Filter to include only years up to 2023
crime_data_filtered <- crime_data %>%
filter(year(DATE.OCC) < 2024)
# Calculate year-over-year percentage changes
crime_summary <- crime_data_filtered %>%
mutate(Year = year(DATE.OCC)) %>%
filter(Year %in% c(2020, 2023)) %>% # Only for years 2020 and 2023
group_by(AREA.NAME, Year) %>%
summarise(crime_count = n(), .groups = "drop") %>%
pivot_wider(names_from = Year, values_from = crime_count, values_fill = list(crime_count = 0)) %>%
# Calculate percent change from 2020 to 2023
mutate(percent_change_2020_2023 = ((`2023` - `2020`) / `2020`) * 100)
# Display the results
kable(crime_summary, format = "markdown", digits = 2)
| AREA.NAME | 2020 | 2023 | percent_change_2020_2023 |
|---|---|---|---|
| 77th Street | 13343 | 13944 | 4.50 |
| Central | 11600 | 16957 | 46.18 |
| Devonshire | 7982 | 9762 | 22.30 |
| Foothill | 7106 | 7155 | 0.69 |
| Harbor | 8874 | 9125 | 2.83 |
| Hollenbeck | 7805 | 8401 | 7.64 |
| Hollywood | 10171 | 11447 | 12.55 |
| Mission | 8480 | 8973 | 5.81 |
| N Hollywood | 10167 | 11548 | 13.58 |
| Newton | 9994 | 11926 | 19.33 |
| Northeast | 8452 | 9707 | 14.85 |
| Olympic | 9639 | 11739 | 21.79 |
| Pacific | 11575 | 13772 | 18.98 |
| Rampart | 9025 | 11527 | 27.72 |
| Southeast | 10847 | 11288 | 4.07 |
| Southwest | 11178 | 13109 | 17.28 |
| Topanga | 8106 | 9636 | 18.87 |
| Van Nuys | 8763 | 9936 | 13.39 |
| West LA | 9309 | 10578 | 13.63 |
| West Valley | 8091 | 9930 | 22.73 |
| Wilshire | 9291 | 11707 | 26.00 |
# Ensure we have the correct top areas based on overall increase from 2020 to 2023
top_areas <- crime_summary %>%
arrange(desc(percent_change_2020_2023)) %>%
head(5) %>%
pull(AREA.NAME)
# Calculate crime counts and percentage change by area and crime type
crime_analysis <- crime_data %>%
filter(year(DATE.OCC) %in% c(2020, 2023), AREA.NAME %in% top_areas) %>%
mutate(Year = year(DATE.OCC)) %>%
group_by(AREA.NAME, Crm.Cd.Desc, Year) %>%
summarise(count = n(), .groups = "drop") %>%
pivot_wider(names_from = Year, values_from = count) %>%
# Only fill missing years with 0 where genuinely needed after checking
mutate(
count_2020 = replace_na(`2020`, 0),
count_2023 = replace_na(`2023`, 0),
percent_change = ifelse(count_2020 > 0, ((count_2023 - count_2020) / count_2020) * 100, NA),
absolute_change = count_2023 - count_2020
) %>%
filter(!is.na(percent_change)) %>%
arrange(AREA.NAME, desc(percent_change))
# Get top 3 most increased crimes for each area
top_crimes_by_area <- crime_analysis %>%
group_by(AREA.NAME) %>%
filter(percent_change > 0) %>% # Only include increases
slice_max(percent_change, n = 3) %>% # Get top 3 crimes by percent change
arrange(AREA.NAME, desc(percent_change))
# Display the results
print(top_crimes_by_area)
## # A tibble: 15 × 8
## # Groups: AREA.NAME [5]
## AREA.NAME Crm.Cd.Desc `2020` `2023` count_2020 count_2023 percent_change
## <chr> <chr> <int> <int> <int> <int> <dbl>
## 1 Central VEHICLE, STOL… 1 56 1 56 5500
## 2 Central PICKPOCKET 15 280 15 280 1767.
## 3 Central SHOPLIFTING-G… 15 71 15 71 373.
## 4 Devonshire VEHICLE, STOL… 1 28 1 28 2700
## 5 Devonshire PICKPOCKET 1 12 1 12 1100
## 6 Devonshire SODOMY/SEXUAL… 1 9 1 9 800
## 7 Rampart PICKPOCKET 2 50 2 50 2400
## 8 Rampart VEHICLE, STOL… 2 41 2 41 1950
## 9 Rampart SHOPLIFTING-G… 6 120 6 120 1900
## 10 West Valley BOMB SCARE 1 11 1 11 1000
## 11 West Valley STALKING 1 4 1 4 300
## 12 West Valley BATTERY ON A … 2 7 2 7 250
## 13 Wilshire VEHICLE, STOL… 1 34 1 34 3300
## 14 Wilshire PICKPOCKET 5 77 5 77 1440
## 15 Wilshire SHOPLIFTING -… 216 1381 216 1381 539.
## # ℹ 1 more variable: absolute_change <int>
Based on the year-over-year percentage change analysis from 2020 to 2023:
library(dplyr)
library(ggplot2)
library(lubridate)
# Crime changes by Area, Crime Type, Sex, and Race
crime_analysis <- crime_data %>%
filter(year(DATE.OCC) %in% c(2020, 2023)) %>%
group_by(AREA.NAME, Crm.Cd.Desc, Vict.Sex, Vict.Descent, Year = year(DATE.OCC)) %>%
summarise(count = n(), .groups = "drop") %>%
pivot_wider(names_from = Year,
values_from = count,
names_prefix = "count_") %>%
mutate(
percent_change = ((count_2023 - count_2020) / count_2020) * 100,
absolute_change = count_2023 - count_2020
) %>%
filter(!is.na(percent_change))
# 1. Crime Change by Area and Sex
filtered_crime_analysis <- crime_analysis %>%
filter(Vict.Sex %in% c("M", "F", "X"))
p1 <- ggplot(filtered_crime_analysis,
aes(x = AREA.NAME, y = percent_change, fill = Vict.Sex)) +
geom_bar(stat = "identity", position = "dodge") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Crime Change by Area and Victim Sex (2020-2023)",
x = "Area",
y = "Percent Change",
fill = "Victim Sex")
# 2. Simplified Crime Change by Area and Victim Descent
# Filtering top 5 areas and focusing on significant changes
top_areas <- crime_summary %>%
arrange(desc(percent_change_2020_2023)) %>%
head(5) %>%
pull(AREA.NAME)
# Filter the data for top areas and descent groups with significant changes
simplified_crime_analysis <- crime_analysis %>%
filter(AREA.NAME %in% top_areas) %>%
group_by(AREA.NAME, Vict.Descent) %>%
summarise(avg_percent_change = mean(percent_change, na.rm = TRUE), .groups = 'drop') %>%
filter(avg_percent_change > 100) # Threshold to highlight significant changes
# Create the simplified plot
crime_analysis_filtered <- crime_analysis %>%
filter(Vict.Descent != "X") %>% # Remove unknown values
group_by(AREA.NAME, Vict.Descent) %>%
summarise(percent_change = mean(percent_change, na.rm = TRUE))
# Create the plot with filtered data
p2 <- ggplot(simplified_crime_analysis,
aes(x = AREA.NAME, y = avg_percent_change, fill = Vict.Descent)) +
geom_bar(stat = "identity", position = "dodge") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Significant Crime Change by Area and Victim Descent (Top 5 Areas, 2020-2023)",
x = "Area",
y = "Average Percent Change",
fill = "Victim Descent") +
scale_fill_brewer(palette = "Set3")
print(p1)
print(p2)
# Summary table
summary_table <- crime_analysis %>%
group_by(AREA.NAME, Vict.Sex, Vict.Descent) %>%
summarise(
avg_percent_change = mean(percent_change, na.rm = TRUE),
total_crimes_2020 = sum(count_2020, na.rm = TRUE),
total_crimes_2023 = sum(count_2023, na.rm = TRUE)
) %>%
arrange(desc(avg_percent_change))
print(summary_table)
## # A tibble: 498 × 6
## # Groups: AREA.NAME, Vict.Sex [85]
## AREA.NAME Vict.Sex Vict.Descent avg_percent_change total_crimes_2020
## <chr> <chr> <chr> <dbl> <int>
## 1 Wilshire "X" "X" 2159. 114
## 2 N Hollywood "X" "X" 1415. 29
## 3 Foothill "X" "X" 1128. 11
## 4 Devonshire "" "" 747. 946
## 5 Harbor "" "" 737. 1276
## 6 Olympic "F" "C" 652 9
## 7 Southeast "F" "I" 600 1
## 8 Central "M" "I" 550 2
## 9 Topanga "" "" 541. 864
## 10 Devonshire "F" "C" 500 1
## # ℹ 488 more rows
## # ℹ 1 more variable: total_crimes_2023 <int>
2020 to 2023, we observe the following:
A = “Other Asian”,B = “Black”, C = “Chinese”, D = “Cambodian”, F = “Filipino”, G = “Guamanian”, H = “Hispanic/Latin/Mexican”, I = “American Indian/Alaskan Native”, J = “Japanese”, K = “Korean”, L = “Laotian”, O = “Other”, P = “Pacific Islander”, S = “Samoan”, U = “Hawaiian”, V = “Vietnamese”, W = “White”, X = “Unknown”, Z = “Asian Indian”
How do crime rates fluctuate during the holiday season, particularly in November and December, and what types of crimes predominantly occur during these months? Additionally, how have these patterns evolved over the years?
head(crime$Date.Rptd)
## [1] "03/01/2020 12:00:00 AM" "02/09/2020 12:00:00 AM" "11/11/2020 12:00:00 AM"
## [4] "05/10/2023 12:00:00 AM" "08/18/2022 12:00:00 AM" "04/04/2023 12:00:00 AM"
library(dplyr)
library(ggplot2)
library(lubridate)
library(plotly)
# Display the structure to understand the format of the Date column
str(crime)
## 'data.frame': 986500 obs. of 28 variables:
## $ DR_NO : int 190326475 200106753 200320258 200907217 220614831 231808869 230110144 220314085 231309864 211904005 ...
## $ Date.Rptd : chr "03/01/2020 12:00:00 AM" "02/09/2020 12:00:00 AM" "11/11/2020 12:00:00 AM" "05/10/2023 12:00:00 AM" ...
## $ DATE.OCC : chr "03/01/2020 12:00:00 AM" "02/08/2020 12:00:00 AM" "11/04/2020 12:00:00 AM" "03/10/2020 12:00:00 AM" ...
## $ TIME.OCC : int 2130 1800 1700 2037 1200 2300 900 1110 1400 1220 ...
## $ AREA : int 7 1 3 9 6 18 1 3 13 19 ...
## $ AREA.NAME : chr "Wilshire" "Central" "Southwest" "Van Nuys" ...
## $ Rpt.Dist.No : int 784 182 356 964 666 1826 182 303 1375 1974 ...
## $ Part.1.2 : int 1 1 1 1 2 2 2 2 2 2 ...
## $ Crm.Cd : int 510 330 480 343 354 354 354 354 354 624 ...
## $ Crm.Cd.Desc : chr "VEHICLE - STOLEN" "BURGLARY FROM VEHICLE" "BIKE - STOLEN" "SHOPLIFTING-GRAND THEFT ($950.01 & OVER)" ...
## $ Mocodes : chr "" "1822 1402 0344" "0344 1251" "0325 1501" ...
## $ Vict.Age : int 0 47 19 19 28 41 25 27 24 26 ...
## $ Vict.Sex : chr "M" "M" "X" "M" ...
## $ Vict.Descent : chr "O" "O" "X" "O" ...
## $ Premis.Cd : int 101 128 502 405 102 501 502 248 750 502 ...
## $ Premis.Desc : chr "STREET" "BUS STOP/LAYOVER (ALSO QUERY 124)" "MULTI-UNIT DWELLING (APARTMENT, DUPLEX, ETC)" "CLOTHING STORE" ...
## $ Weapon.Used.Cd: int NA NA NA NA NA NA NA NA NA 400 ...
## $ Weapon.Desc : chr "" "" "" "" ...
## $ Status : chr "AA" "IC" "IC" "IC" ...
## $ Status.Desc : chr "Adult Arrest" "Invest Cont" "Invest Cont" "Invest Cont" ...
## $ Crm.Cd.1 : int 510 330 480 343 354 354 354 354 354 624 ...
## $ Crm.Cd.2 : int 998 998 NA NA NA NA NA NA NA NA ...
## $ Crm.Cd.3 : int NA NA NA NA NA NA NA NA NA NA ...
## $ Crm.Cd.4 : int NA NA NA NA NA NA NA NA NA NA ...
## $ LOCATION : chr "1900 S LONGWOOD AV" "1000 S FLOWER ST" "1400 W 37TH ST" "14000 RIVERSIDE DR" ...
## $ Cross.Street : chr "" "" "" "" ...
## $ LAT : num 34 34 34 34.2 34.1 ...
## $ LON : num -118 -118 -118 -118 -118 ...
# Print the first few entries of the Date.Rptd column to check its format
print(head(crime$Date.Rptd))
## [1] "03/01/2020 12:00:00 AM" "02/09/2020 12:00:00 AM" "11/11/2020 12:00:00 AM"
## [4] "05/10/2023 12:00:00 AM" "08/18/2022 12:00:00 AM" "04/04/2023 12:00:00 AM"
# Assuming the date is in a column named 'Date.Rptd' and format is 'mm/dd/yyyy'
crime$Date.Rptd <- as.Date(crime$Date.Rptd, format="%m/%d/%Y")
# Check if there were any conversion errors
sum(is.na(crime$Date.Rptd))
## [1] 0
# Extract Year, Month, and Day from the Date.Rptd
crime$Year <- year(crime$Date.Rptd)
crime$Month <- month(crime$Date.Rptd)
crime$Day <- day(crime$Date.Rptd)
# Summarize total crimes by month and year
monthly_crime_summary <- crime %>%
group_by(Year, Month) %>%
summarise(Total_Crimes = n(), .groups = 'drop')
# Plot total crimes by month and year
mcs <- ggplot(monthly_crime_summary, aes(x = Month, y = Total_Crimes, group = Year, color = as.factor(Year))) +
geom_line() +
labs(title = "Monthly Crime Trends by Year", x = "Month", y = "Total Crimes") +
scale_x_continuous(breaks = 1:12, labels = month.abb)
ggplotly(mcs)
# Analyzing crime types during November and December across all years
holiday_crimes <- crime %>%
filter(Month %in% c(11, 12)) %>%
group_by(Year, Crm.Cd.Desc) %>%
summarise(Total_Crimes = n(), .groups = 'drop') %>%
arrange(Year, desc(Total_Crimes))
# Print the top crime types during holidays
print(holiday_crimes)
## # A tibble: 455 × 3
## Year Crm.Cd.Desc Total_Crimes
## <dbl> <chr> <int>
## 1 2020 VEHICLE - STOLEN 3522
## 2 2020 BATTERY - SIMPLE ASSAULT 2433
## 3 2020 BURGLARY 2071
## 4 2020 BURGLARY FROM VEHICLE 1960
## 5 2020 VANDALISM - FELONY ($400 & OVER, ALL CHURCH VANDALISMS) 1957
## 6 2020 ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT 1933
## 7 2020 THEFT PLAIN - PETTY ($950 & UNDER) 1741
## 8 2020 INTIMATE PARTNER - SIMPLE ASSAULT 1621
## 9 2020 THEFT FROM MOTOR VEHICLE - PETTY ($950 & UNDER) 1411
## 10 2020 THEFT OF IDENTITY 1260
## # ℹ 445 more rows
# Plotting top crime types during holiday months
top_crime_types <- holiday_crimes %>%
group_by(Crm.Cd.Desc) %>%
summarise(Total_Crimes = sum(Total_Crimes), .groups = 'drop') %>%
top_n(20, Total_Crimes)
# Filter the main data to include only these top crime types
filtered_crimes <- holiday_crimes %>%
filter(Crm.Cd.Desc %in% top_crime_types$Crm.Cd.Desc)
# Plot with top crime types
fc <- ggplot(filtered_crimes, aes(x = reorder(Crm.Cd.Desc, -Total_Crimes), y = Total_Crimes, fill = as.factor(Year))) +
geom_bar(stat = "identity", position = position_dodge()) +
labs(title = "Top 20 Crime Types During Holidays", x = "Crime Type", y = "Total Crimes") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
coord_flip() # This flips the axes to make the plot horizontal
ggplotly(fc)
# Annual trend of total crimes during November and December
annual_holiday_crime_trends <- holiday_crimes %>%
group_by(Year) %>%
summarise(Total_Crimes = sum(Total_Crimes), .groups = 'drop')
ahc <- ggplot(annual_holiday_crime_trends, aes(x = Year, y = Total_Crimes)) +
geom_line(group=1, color="blue") +
geom_point(color="red") +
labs(title = "Annual Holiday Crime Trends", x = "Year", y = "Total Crimes")
ggplotly(ahc)
# Add a day of the week column
crime$Day_of_Week <- wday(crime$Date.Rptd, label = TRUE, abbr = FALSE)
# Summarize crimes by day of the week during the holiday months
weekday_holiday_crime_summary <- crime %>%
filter(Month %in% c(11, 12)) %>%
group_by(Day_of_Week) %>%
summarise(Total_Crimes = n(), .groups = 'drop') %>%
arrange(Day_of_Week)
dow <- ggplot(weekday_holiday_crime_summary, aes(x = Day_of_Week, y = Total_Crimes, fill = Day_of_Week)) +
geom_bar(stat = "identity") +
labs(title = "Crime Distribution by Day of the Week During Holidays", x = "Day of the Week", y = "Total Crimes")
ggplotly(dow)
Seasonal Trends: The first plot, displaying monthly crime trends by year, shows distinct seasonal fluctuations. Typically, crime rates appear to increase during certain months, which might correlate with seasonal activities or societal patterns. Identifying these peaks can help in planning better law enforcement deployment during high-risk times.
Top Crimes During Holidays: The list and plot of top crime types during November and December for each year reveal that certain types of crimes, such as vehicle theft, burglary, and assault, consistently rank high during the holiday seasons. This pattern suggests a need for targeted preventive measures during these periods when specific crimes spike, possibly due to the increased opportunity (with many homes potentially left empty and more retail activity).
Visual Accessibility: The bar chart displaying top 20 crime types during the holidays uses a horizontal layout, making it easier to read and compare the frequency of different crimes. This visualization helps stakeholders quickly identify the most prevalent crimes during the holiday seasons, which could inform public awareness campaigns and policing strategies.
Detailed Examination: The detailed breakdown by crime type, year, and month provides a granular view of crime trends, offering insights not only into the ‘what’ and ‘when’ but potentially the ‘why’ of crime patterns. For instance, understanding that thefts increase during November and December could be linked to the higher volume of shopping and unattended properties during the holiday season.
*** END OF QUESTION 3 ***
How does the type or frequency of crimes in Los Angeles vary by victim descent, and are there significant geographic patterns (based on latitude and longitude) associated with specific victim groups?
library(tidyverse)
library(ggplot2)
library(sf)
library(cluster)
library(ggmap)
library(dplyr)
names(crime_data)
## [1] "DR_NO" "Date.Rptd" "DATE.OCC" "TIME.OCC" "AREA"
## [6] "AREA.NAME" "Crm.Cd" "Crm.Cd.Desc" "Vict.Age" "Vict.Sex"
## [11] "Vict.Descent" "Weapon.Desc" "LOCATION" "LAT" "LON"
crime_frequency <- crime_data %>%
summarise(Frequency = n()) %>%
arrange(desc(Frequency))
crime_data %>%
group_by(Vict.Descent) %>%
summarise(Frequency = n()) %>%
ggplot(aes(x = Vict.Descent, y = Frequency, fill = Vict.Descent)) +
geom_bar(stat = "identity") +
labs(title = "Total Crimes by Victim Descent", x = "Victim Descent", y = "Frequency") +
theme_minimal()
crime_data$Time_Period <- cut(crime_data$TIME.OCC,
breaks = c(0, 600, 1200, 1800, 2400),
labels = c("Night", "Morning", "Afternoon", "Evening"))
crime_data %>%
group_by(Vict.Descent, Time_Period) %>%
summarise(Frequency = n()) %>%
ggplot(aes(x = Vict.Descent, y = Time_Period, fill = Frequency)) +
geom_tile(color = "white") +
scale_fill_gradient(low = "lightblue", high = "darkblue") +
labs(title = "Crime Frequency by Victim Descent and Time of Day", x = "Victim Descent", y = "Time of Day") +
theme_minimal()
library(lubridate)
crime_data$Date.OCC <- as.Date(crime_data$DATE.OCC, format = "%m/%d/%Y")
crime_data$Month <- floor_date(crime_data$Date.OCC, "month")
crime_data %>%
group_by(Month, Vict.Descent) %>%
summarise(Frequency = n()) %>%
ggplot(aes(x = Month, y = Frequency, color = Vict.Descent)) +
geom_line(size = 1) +
labs(title = "Crime Trend Over Time by Victim Descent", x = "Year", y = "Frequency") +
theme_minimal()
theme_minimal()
## List of 136
## $ line :List of 6
## ..$ colour : chr "black"
## ..$ linewidth : num 0.5
## ..$ linetype : num 1
## ..$ lineend : chr "butt"
## ..$ arrow : logi FALSE
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_line" "element"
## $ rect :List of 5
## ..$ fill : chr "white"
## ..$ colour : chr "black"
## ..$ linewidth : num 0.5
## ..$ linetype : num 1
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_rect" "element"
## $ text :List of 11
## ..$ family : chr ""
## ..$ face : chr "plain"
## ..$ colour : chr "black"
## ..$ size : num 11
## ..$ hjust : num 0.5
## ..$ vjust : num 0.5
## ..$ angle : num 0
## ..$ lineheight : num 0.9
## ..$ margin : 'margin' num [1:4] 0points 0points 0points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : logi FALSE
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ title : NULL
## $ aspect.ratio : NULL
## $ axis.title : NULL
## $ axis.title.x :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 1
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 2.75points 0points 0points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.title.x.top :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 0
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 0points 2.75points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.title.x.bottom : NULL
## $ axis.title.y :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 1
## ..$ angle : num 90
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 2.75points 0points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.title.y.left : NULL
## $ axis.title.y.right :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 1
## ..$ angle : num -90
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 0points 0points 2.75points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : chr "grey30"
## ..$ size : 'rel' num 0.8
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.x :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 1
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 2.2points 0points 0points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.x.top :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 0
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 0points 2.2points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.x.bottom : NULL
## $ axis.text.y :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 1
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 2.2points 0points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.y.left : NULL
## $ axis.text.y.right :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 0
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 0points 0points 2.2points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.theta : NULL
## $ axis.text.r :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 0.5
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 2.2points 0points 2.2points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.ticks : list()
## ..- attr(*, "class")= chr [1:2] "element_blank" "element"
## $ axis.ticks.x : NULL
## $ axis.ticks.x.top : NULL
## $ axis.ticks.x.bottom : NULL
## $ axis.ticks.y : NULL
## $ axis.ticks.y.left : NULL
## $ axis.ticks.y.right : NULL
## $ axis.ticks.theta : NULL
## $ axis.ticks.r : NULL
## $ axis.minor.ticks.x.top : NULL
## $ axis.minor.ticks.x.bottom : NULL
## $ axis.minor.ticks.y.left : NULL
## $ axis.minor.ticks.y.right : NULL
## $ axis.minor.ticks.theta : NULL
## $ axis.minor.ticks.r : NULL
## $ axis.ticks.length : 'simpleUnit' num 2.75points
## ..- attr(*, "unit")= int 8
## $ axis.ticks.length.x : NULL
## $ axis.ticks.length.x.top : NULL
## $ axis.ticks.length.x.bottom : NULL
## $ axis.ticks.length.y : NULL
## $ axis.ticks.length.y.left : NULL
## $ axis.ticks.length.y.right : NULL
## $ axis.ticks.length.theta : NULL
## $ axis.ticks.length.r : NULL
## $ axis.minor.ticks.length : 'rel' num 0.75
## $ axis.minor.ticks.length.x : NULL
## $ axis.minor.ticks.length.x.top : NULL
## $ axis.minor.ticks.length.x.bottom: NULL
## $ axis.minor.ticks.length.y : NULL
## $ axis.minor.ticks.length.y.left : NULL
## $ axis.minor.ticks.length.y.right : NULL
## $ axis.minor.ticks.length.theta : NULL
## $ axis.minor.ticks.length.r : NULL
## $ axis.line : list()
## ..- attr(*, "class")= chr [1:2] "element_blank" "element"
## $ axis.line.x : NULL
## $ axis.line.x.top : NULL
## $ axis.line.x.bottom : NULL
## $ axis.line.y : NULL
## $ axis.line.y.left : NULL
## $ axis.line.y.right : NULL
## $ axis.line.theta : NULL
## $ axis.line.r : NULL
## $ legend.background : list()
## ..- attr(*, "class")= chr [1:2] "element_blank" "element"
## $ legend.margin : 'margin' num [1:4] 5.5points 5.5points 5.5points 5.5points
## ..- attr(*, "unit")= int 8
## $ legend.spacing : 'simpleUnit' num 11points
## ..- attr(*, "unit")= int 8
## $ legend.spacing.x : NULL
## $ legend.spacing.y : NULL
## $ legend.key : list()
## ..- attr(*, "class")= chr [1:2] "element_blank" "element"
## $ legend.key.size : 'simpleUnit' num 1.2lines
## ..- attr(*, "unit")= int 3
## $ legend.key.height : NULL
## $ legend.key.width : NULL
## $ legend.key.spacing : 'simpleUnit' num 5.5points
## ..- attr(*, "unit")= int 8
## $ legend.key.spacing.x : NULL
## $ legend.key.spacing.y : NULL
## $ legend.frame : NULL
## $ legend.ticks : NULL
## $ legend.ticks.length : 'rel' num 0.2
## $ legend.axis.line : NULL
## $ legend.text :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : 'rel' num 0.8
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ legend.text.position : NULL
## $ legend.title :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 0
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ legend.title.position : NULL
## $ legend.position : chr "right"
## $ legend.position.inside : NULL
## $ legend.direction : NULL
## $ legend.byrow : NULL
## $ legend.justification : chr "center"
## $ legend.justification.top : NULL
## $ legend.justification.bottom : NULL
## $ legend.justification.left : NULL
## $ legend.justification.right : NULL
## $ legend.justification.inside : NULL
## $ legend.location : NULL
## $ legend.box : NULL
## $ legend.box.just : NULL
## $ legend.box.margin : 'margin' num [1:4] 0cm 0cm 0cm 0cm
## ..- attr(*, "unit")= int 1
## $ legend.box.background : list()
## ..- attr(*, "class")= chr [1:2] "element_blank" "element"
## $ legend.box.spacing : 'simpleUnit' num 11points
## ..- attr(*, "unit")= int 8
## [list output truncated]
## - attr(*, "class")= chr [1:2] "theme" "gg"
## - attr(*, "complete")= logi TRUE
## - attr(*, "validate")= logi TRUE
library(dplyr)
library(ggplot2)
crime_distribution <- crime_data %>%
group_by(Vict.Descent) %>%
summarise(Frequency = n(), .groups = "drop") %>%
mutate(Percentage = Frequency / sum(Frequency) * 100)
ggplot(crime_distribution, aes(x = "", y = Percentage, fill = Vict.Descent)) +
geom_bar(stat = "identity", width = 1) +
coord_polar("y") +
labs(title = "Distribution of Crimes by Victim Descent",
fill = "Victim Descent") +
theme_minimal() +
theme(axis.text.x = element_blank())
library(ggplot2)
library(dplyr)
victim_counts <- crime_data %>%
count(Vict.Descent)
ggplot(victim_counts, aes(x = Vict.Descent, y = n)) +
geom_bar(stat = "identity") +
labs(title = "Count of Crimes by Victim Descent",
x = "Victim Descent",
y = "Count of Crimes") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme_minimal()
# ANOVA for Longitude
anova_lon <- aov(LON ~ Vict.Descent, data = crime_data)
summary(anova_lon)
## Df Sum Sq Mean Sq F value Pr(>F)
## Vict.Descent 20 10754 538 16.8 <2e-16 ***
## Residuals 986479 31613033 32
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ANOVA for Latitude
anova_lat <- aov(LAT ~ Vict.Descent, data = crime_data)
summary(anova_lat)
## Df Sum Sq Mean Sq F value Pr(>F)
## Vict.Descent 20 1380 69.0 25.9 <2e-16 ***
## Residuals 986479 2631056 2.7
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(dplyr)
library(tidyr)
unique_values <- unique(crime_data$Vict.Descent)
print(unique_values)
## [1] "O" "X" "H" "B" "W" "" "A" "K" "C" "J" "F" "I" "V" "S" "P" "Z" "G" "U" "D"
## [20] "L" "-"
relevant_data <- crime_data %>%
select(LON, LAT, Vict.Descent)
relevant_data$Vict.Descent <- as.factor(relevant_data$Vict.Descent)
dummy_data <- model.matrix(~ Vict.Descent - 1, data = relevant_data)
combined_data <- cbind(relevant_data[, c("LON", "LAT")], dummy_data)
cor_matrix <- cor(combined_data, use = "pairwise.complete.obs")
head(cor_matrix)
## LON LAT Vict.Descent Vict.Descent- Vict.DescentA
## LON 1.00e+00 -9.98e-01 -0.013629 -7.61e-05 -0.00289
## LAT -9.98e-01 1.00e+00 0.012784 9.36e-05 0.00345
## Vict.Descent -1.36e-02 1.28e-02 1.000000 -5.78e-04 -0.05999
## Vict.Descent- -7.61e-05 9.36e-05 -0.000578 1.00e+00 -0.00021
## Vict.DescentA -2.89e-03 3.45e-03 -0.059991 -2.10e-04 1.00000
## Vict.DescentB 3.68e-03 -1.21e-02 -0.161268 -5.66e-04 -0.05870
## Vict.DescentB Vict.DescentC Vict.DescentD Vict.DescentF
## LON 0.003681 -2.62e-03 -3.69e-04 -1.55e-03
## LAT -0.012099 2.43e-03 3.14e-04 2.86e-03
## Vict.Descent -0.161268 -2.70e-02 -3.83e-03 -2.78e-02
## Vict.Descent- -0.000566 -9.45e-05 -1.34e-05 -9.74e-05
## Vict.DescentA -0.058699 -9.81e-03 -1.40e-03 -1.01e-02
## Vict.DescentB 1.000000 -2.64e-02 -3.75e-03 -2.72e-02
## Vict.DescentG Vict.DescentH Vict.DescentI Vict.DescentJ
## LON 2.10e-03 0.008348 -4.04e-05 -1.32e-03
## LAT -2.08e-03 -0.005730 -7.27e-05 8.24e-04
## Vict.Descent -3.47e-03 -0.263940 -1.28e-02 -1.59e-02
## Vict.Descent- -1.22e-05 -0.000926 -4.49e-05 -5.57e-05
## Vict.DescentA -1.26e-03 -0.096071 -4.66e-03 -5.78e-03
## Vict.DescentB -3.39e-03 -0.258255 -1.25e-02 -1.55e-02
## Vict.DescentK Vict.DescentL Vict.DescentO Vict.DescentP
## LON -0.000673 -3.31e-04 -0.001218 -7.69e-04
## LAT 0.000709 1.68e-04 0.005302 8.65e-04
## Vict.Descent -0.031004 -3.49e-03 -0.118112 -6.83e-03
## Vict.Descent- -0.000109 -1.22e-05 -0.000414 -2.39e-05
## Vict.DescentA -0.011285 -1.27e-03 -0.042991 -2.49e-03
## Vict.DescentB -0.030336 -3.42e-03 -0.115568 -6.68e-03
## Vict.DescentS Vict.DescentU Vict.DescentV Vict.DescentW
## LON -2.98e-04 -7.15e-04 -3.67e-04 -0.004784
## LAT 7.31e-05 6.18e-04 5.05e-04 0.008464
## Vict.Descent -3.03e-03 -5.95e-03 -1.37e-02 -0.203293
## Vict.Descent- -1.06e-05 -2.09e-05 -4.82e-05 -0.000713
## Vict.DescentA -1.10e-03 -2.17e-03 -5.00e-03 -0.073996
## Vict.DescentB -2.97e-03 -5.82e-03 -1.34e-02 -0.198914
## Vict.DescentX Vict.DescentZ
## LON 0.009028 -1.33e-03
## LAT -0.011396 1.55e-03
## Vict.Descent -0.139342 -9.47e-03
## Vict.Descent- -0.000489 -3.32e-05
## Vict.DescentA -0.050719 -3.45e-03
## Vict.DescentB -0.136341 -9.27e-03
library(ggplot2)
library(reshape2)
library(corrplot)
numeric_data <- crime_data[sapply(crime_data, is.numeric)]
corr <- cor(numeric_data, use = "complete.obs") # use "complete.obs" to handle missing values
corrplot(corr, method = "color",
addCoef.col = "black", # Add correlation coefficients
tl.col = "black", # Text color for labels
tl.srt = 45, # Rotation of text labels
number.cex = 0.9, # Size of the correlation coefficient text
tl.cex = 0.9, # Size of the text label for variables
col = colorRampPalette(c("navy", "white", "firebrick3"))(200), # Refined color palette
title = "Correlation Matrix", # Title of the plot
mar = c(1,1,2,1), # Adjusted margins for spacing
cl.cex = 0.8, # Size of the color legend text
cl.pos = "r", # Position of the color legend to the right
diag = FALSE) # Remove diagonal for better clarity
ggplot(crime_data, aes(x = LON)) +
geom_density(aes(fill = Vict.Descent), alpha = 0.5) +
facet_wrap(~ Vict.Descent) +
labs(title = "Geographic Patterns of Crime (Longitude) by Victim Descent",
x = "Longitude",
y = "Density") +
theme_minimal()
ggplot(crime_data, aes(x = LAT)) +
geom_density(aes(fill = Vict.Descent), alpha = 0.5) +
facet_wrap(~ Vict.Descent) +
labs(title = "Geographic Patterns of Crime (Latitude) by Victim Descent",
x = "Latitude",
y = "Density") +
theme_minimal()
anova_result_1 <- aov(LAT ~ AREA, data = crime_data)
summary(anova_result_1)
## Df Sum Sq Mean Sq F value Pr(>F)
## AREA 1 1590 1590 596 <2e-16 ***
## Residuals 986498 2630846 3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova_result_2 <- aov(LON ~ AREA, data = crime_data)
summary(anova_result_2)
## Df Sum Sq Mean Sq F value Pr(>F)
## AREA 1 3155 3155 98.4 <2e-16 ***
## Residuals 986498 31620632 32
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ANOVA for Longitude
anova_lon <- aov(LON ~ Vict.Descent, data = crime_data)
summary(anova_lon)
## Df Sum Sq Mean Sq F value Pr(>F)
## Vict.Descent 20 10754 538 16.8 <2e-16 ***
## Residuals 986479 31613033 32
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ANOVA for Latitude
anova_lat <- aov(LAT ~ Vict.Descent, data = crime_data)
summary(anova_lat)
## Df Sum Sq Mean Sq F value Pr(>F)
## Vict.Descent 20 1380 69.0 25.9 <2e-16 ***
## Residuals 986479 2631056 2.7
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Filter data for two specific victim descent groups and non-missing lat/lon values
geo_subset <- crime_data %>%
filter(Vict.Descent %in% c("H", "B"), !is.na(LAT), !is.na(LON))
# Ensure the data has exactly two levels for Vict.Descent
if (length(unique(geo_subset$Vict.Descent)) == 2) {
# T-test for differences in latitude and longitude between the two groups
latitude_t_test <- t.test(LAT ~ Vict.Descent, data = geo_subset)
longitude_t_test <- t.test(LON ~ Vict.Descent, data = geo_subset)
# Display results
list(Latitude_Test = latitude_t_test, Longitude_Test = longitude_t_test)
} else {
print("Data does not have exactly 2 groups for Vict.Descent. Please check the filtering.")
}
## $Latitude_Test
##
## Welch Two Sample t-test
##
## data: LAT by Vict.Descent
## t = -6, df = 3e+05, p-value = 6e-10
## alternative hypothesis: true difference in means between group B and group H is not equal to 0
## 95 percent confidence interval:
## -0.0465 -0.0242
## sample estimates:
## mean in group B mean in group H
## 33.9 34.0
##
##
## $Longitude_Test
##
## Welch Two Sample t-test
##
## data: LON by Vict.Descent
## t = -1, df = 3e+05, p-value = 0.3
## alternative hypothesis: true difference in means between group B and group H is not equal to 0
## 95 percent confidence interval:
## -0.0590 0.0185
## sample estimates:
## mean in group B mean in group H
## -118 -118
Which crimes in Los Angeles exhibited the highest weapon usage, and which ethnicities and genders showed the most significant weapon involvement over the last five years?
la_crime <- crime %>%
filter(!is.na(Weapon.Used.Cd))
tail(la_crime)
## DR_NO Date.Rptd DATE.OCC TIME.OCC AREA AREA.NAME
## 326363 241807402 2024-03-12 03/12/2024 12:00:00 AM 630 18 Southeast
## 326364 240405919 2024-02-27 02/26/2024 12:00:00 AM 1545 4 Hollenbeck
## 326365 241604405 2024-01-14 01/13/2024 12:00:00 AM 720 16 Foothill
## 326366 242004546 2024-01-16 01/16/2024 12:00:00 AM 1510 20 Olympic
## 326367 240104953 2024-01-15 01/15/2024 12:00:00 AM 100 1 Central
## 326368 240309674 2024-04-24 04/24/2024 12:00:00 AM 1500 3 Southwest
## Rpt.Dist.No Part.1.2 Crm.Cd
## 326363 1841 2 624
## 326364 422 1 761
## 326365 1673 1 230
## 326366 2013 2 624
## 326367 101 2 745
## 326368 358 1 230
## Crm.Cd.Desc
## 326363 BATTERY - SIMPLE ASSAULT
## 326364 BRANDISH WEAPON
## 326365 ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT
## 326366 BATTERY - SIMPLE ASSAULT
## 326367 VANDALISM - MISDEAMEANOR ($399 OR UNDER)
## 326368 ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT
## Mocodes Vict.Age Vict.Sex Vict.Descent Premis.Cd
## 326363 1300 1220 0400 33 M H 101
## 326364 0216 0334 0371 0906 18 M H 102
## 326365 0400 1822 1310 0416 36 M H 101
## 326366 2047 0400 0429 0444 1202 1822 80 F O 124
## 326367 0329 0400 0416 0 X X 503
## 326368 1822 0334 0416 0445 0449 1202 70 F W 102
## Premis.Desc Weapon.Used.Cd
## 326363 STREET 400
## 326364 SIDEWALK 101
## 326365 STREET 207
## 326366 BUS STOP 400
## 326367 HOTEL 500
## 326368 SIDEWALK 308
## Weapon.Desc Status Status.Desc
## 326363 STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE) AO Adult Other
## 326364 REVOLVER IC Invest Cont
## 326365 OTHER KNIFE IC Invest Cont
## 326366 STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE) IC Invest Cont
## 326367 UNKNOWN WEAPON/OTHER WEAPON IC Invest Cont
## 326368 STICK IC Invest Cont
## Crm.Cd.1 Crm.Cd.2 Crm.Cd.3 Crm.Cd.4
## 326363 624 NA NA NA
## 326364 761 NA NA NA
## 326365 230 NA NA NA
## 326366 624 NA NA NA
## 326367 745 NA NA NA
## 326368 230 NA NA NA
## LOCATION Cross.Street
## 326363 106TH ST BROADWAY
## 326364 THOMAS ST BALDWIN ST
## 326365 8700 LANKERSHIM BL
## 326366 HOBART BL 3RD ST
## 326367 1300 W SUNSET BL
## 326368 FLOWER ST JEFFERSON BL
## LAT LON Year Month Day Day_of_Week
## 326363 33.9 -118 2024 3 12 Tuesday
## 326364 34.1 -118 2024 2 27 Tuesday
## 326365 34.2 -118 2024 1 14 Sunday
## 326366 34.1 -118 2024 1 16 Tuesday
## 326367 34.1 -118 2024 1 15 Monday
## 326368 34.0 -118 2024 4 24 Wednesday
cols_to_remove <- c(
"Mocodes", "Rpt.Dist.No", "Part.1.2", "Crm.Cd.2","Crm.Cd.3","Crm.Cd.4",
"Premis_cd","Premis.Desc", "Status", "Status.Desc","Cross.Street"
)
# Drop the specified columns
la_crime<- la_crime[, !(names(la_crime) %in% cols_to_remove)]
la_crime <- la_crime %>% rename(
Division_NO = 'DR_NO',
Date_Reported = 'Date.Rptd',
Date_Occurred = 'DATE.OCC',
Time_Occurred = 'TIME.OCC',
Area_Code = 'AREA',
Area_Name = 'AREA.NAME',
Crime_Code = 'Crm.Cd',
Crime_Description = 'Crm.Cd.Desc',
Weapons_Used='Weapon.Used.Cd',
Weapons_Description='Weapon.Desc',
Victim_Age = 'Vict.Age',
Victim_Sex = 'Vict.Sex',
Victim_Descent = 'Vict.Descent',
Crime_Code_1 = 'Crm.Cd',
Location = 'LOCATION',
Latitude = 'LAT',
Longitude = 'LON'
)
print("Data after removing unnecessary columns:")
## [1] "Data after removing unnecessary columns:"
print(names(la_crime))
## [1] "Division_NO" "Date_Reported" "Date_Occurred"
## [4] "Time_Occurred" "Area_Code" "Area_Name"
## [7] "Crime_Code_1" "Crime_Description" "Victim_Age"
## [10] "Victim_Sex" "Victim_Descent" "Premis.Cd"
## [13] "Weapons_Used" "Weapons_Description" "Crm.Cd.1"
## [16] "Location" "Latitude" "Longitude"
## [19] "Year" "Month" "Day"
## [22] "Day_of_Week"
# Count occurrences of "H" in the Victim_Sex column
count_H <- sum(la_crime$Victim_Sex == "H", na.rm = TRUE)
# Print the result
print(count_H)
## [1] 27
# Remove rows where Victim_Sex is "H"
la_crime <- la_crime %>%
filter(Victim_Sex != "H")
# Print the result to verify
nrow(la_crime)
## [1] 326341
# Summarize and find top 10 crimes with weapon usage
top_crimes <- la_crime %>%
group_by(Crime_Description, Weapons_Description) %>%
summarise(count = n(), .groups = 'drop') %>%
arrange(desc(count)) %>%
slice_head(n = 10)
print(top_crimes)
## # A tibble: 10 × 3
## Crime_Description Weapons_Description count
## <chr> <chr> <int>
## 1 BATTERY - SIMPLE ASSAULT STRONG-ARM (HANDS, FIST… 67439
## 2 INTIMATE PARTNER - SIMPLE ASSAULT STRONG-ARM (HANDS, FIST… 43968
## 3 CRIMINAL THREATS - NO WEAPON DISPLAYED VERBAL THREAT 17559
## 4 ROBBERY STRONG-ARM (HANDS, FIST… 14010
## 5 ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT HAND GUN 7658
## 6 INTIMATE PARTNER - AGGRAVATED ASSAULT STRONG-ARM (HANDS, FIST… 7511
## 7 ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT STRONG-ARM (HANDS, FIST… 7122
## 8 ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT UNKNOWN WEAPON/OTHER WE… 5888
## 9 ROBBERY HAND GUN 5830
## 10 BATTERY - SIMPLE ASSAULT UNKNOWN WEAPON/OTHER WE… 5725
# Calculate percentages
top_crimes <- top_crimes %>%
mutate(percentage = count / sum(count) * 100)
summary <- top_crimes %>%
group_by(Crime_Description) %>%
summarise(total_count = sum(count)) %>%
mutate(percentage = (total_count / sum(total_count)) * 100)
custom_colors <- c(
"BATTERY - SIMPLE ASSAULT" = "#FF6F61",
"INTIMATE PARTNER - SIMPLE ASSAULT" = "#6B5B95",
"CRIMINAL THREATS - NO WEAPON DISPLAYED" = "#88B04B",
"ROBBERY" = "#F7CAC9",
"ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT" = "#92A8D1",
"INTIMATE PARTNER - AGGRAVATED ASSAULT" = "#65bdc8",
"UNKNOWN WEAPON/OTHER WEAPON" = "#B565A7"
)
# Create the pie chart
ggplot(summary, aes(x = factor(1), y = total_count, fill = Crime_Description)) +
geom_bar(width = 1, stat = "identity") +
coord_polar(theta="y", start = 0) +
labs(title = "Distribution of Top 10 Crimes in LA with Most Weapons Usage") +
theme_void() +
scale_fill_manual(values = custom_colors) +
theme(legend.position = "right") +
geom_text(aes(label = sprintf("%.1f%%", percentage)),
position = position_stack(vjust = 0.5), color = "black")
weapons_sum <- la_crime %>%
group_by(Victim_Sex, Victim_Descent, Weapons_Description,Crime_Description) %>%
summarize(total_count = n(), .groups = 'drop') %>%
top_n(20, total_count) %>%
arrange(desc(total_count))
head(weapons_sum)
## # A tibble: 6 × 5
## Victim_Sex Victim_Descent Weapons_Description Crime_Description total_count
## <chr> <chr> <chr> <chr> <int>
## 1 F H STRONG-ARM (HANDS, FI… INTIMATE PARTNER… 18269
## 2 M H STRONG-ARM (HANDS, FI… BATTERY - SIMPLE… 18018
## 3 F H STRONG-ARM (HANDS, FI… BATTERY - SIMPLE… 16075
## 4 F B STRONG-ARM (HANDS, FI… INTIMATE PARTNER… 8371
## 5 F B STRONG-ARM (HANDS, FI… BATTERY - SIMPLE… 7486
## 6 M W STRONG-ARM (HANDS, FI… BATTERY - SIMPLE… 7125
custom_labels <- c("Hand Gun", "Strong Arm", "Verbal Threat")
ggplot(weapons_sum, aes(x = Weapons_Description, y = total_count, fill = Victim_Sex)) +
geom_bar(stat = "identity") +
facet_wrap(~ Victim_Descent) + # Separate panels for each descent
labs(title = "Total Weapons Used by Victim Sex and Descent",
x = "Weapon Description",
y = "Total Count") +
scale_x_discrete(labels = custom_labels) +
scale_y_continuous(limits = c(0, max(weapons_sum$total_count) * 1.1)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Analysis
1) Hand Gun usage is notably higher for Black (B) and Hispanic (H)
victims, with males being disproportionately represented. Strong Arm
(physical force) incidents are more frequent for White (W) victims, with
a balanced representation of both male and female victims.
Across all categories, males (teal) are more frequently victims, especially in cases involving firearms. Female victims (red) appear more frequently in non-weapon-based incidents such as Strong Arm cases (e.g., assaults).
Hispanic (H) victims have a high rate of gun-related incidents involving male victims. White (W) victims show a high count for Strong Arm incidents, with females almost matching male victims. Other (O) group mainly experiences incidents involving Strong Arm, with no significant firearm usage.
This visualization suggests that firearms are predominantly involved in incidents affecting Black and Hispanic males. Meanwhile, physical force is more common among White victims. Gender-based victimization patterns are also evident, with men more frequently involved in firearm-related incidents, whereas women are more often victims in physical confrontations.
# Total crimes by sex and ethnicity
total_crimes_by_sex_ethnicity <- la_crime %>%
filter(!is.na(Victim_Sex) & !is.na(Victim_Descent)) %>%
group_by(Victim_Sex, Victim_Descent) %>%
summarise(total_count = n(), .groups = 'drop') %>%
arrange(desc(total_count))
# Print the result
head(total_crimes_by_sex_ethnicity)
## # A tibble: 6 × 3
## Victim_Sex Victim_Descent total_count
## <chr> <chr> <int>
## 1 M H 77006
## 2 F H 73824
## 3 F B 38681
## 4 M B 31035
## 5 M W 29463
## 6 F W 24115
descent_labels <- c(
A = "Other Asian",B = "Black", C = "Chinese", D = "Cambodian", F = "Filipino", G = "Guamanian",
H = "Hispanic/Latin/Mexican", I = "American Indian/Alaskan Native", J = "Japanese", K = "Korean",
L = "Laotian", O = "Other", P = "Pacific Islander", S = "Samoan", U = "Hawaiian", V = "Vietnamese",
W = "White", X = "Unknown", Z = "Asian Indian"
)
# Create a heatmap
ggplot(total_crimes_by_sex_ethnicity, aes(x = factor(Victim_Descent, levels = names(descent_labels)),
y = Victim_Sex, fill = total_count)) +
geom_tile(color = "white") +
scale_fill_gradient(low = "lightblue", high = "darkblue") +
labs(title = "Heatmap of Total Crimes by Victim Sex and Ethnicity",
x = "Ethnicity",
y = "Victim Sex",
fill = "Total Crimes") +
scale_x_discrete(labels = descent_labels) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Analysis:
Males: 1) For several ethnic groups, males are involved in the majority of crimes compared to females. Black and Hispanic/Latino/Mexican groups exhibit the highest crime counts, indicated by darker shades.
Females: 1) Crimes against females appear less frequent overall, with relatively lighter shades. However, there is still notable representation among Black and Hispanic/Latino/Mexican victims, though to a lesser extent than males.
Unknown/Other Sex (X): 1) This category has significant counts across several ethnicities, particularly Black, Hispanic/Latino/Mexican, and White groups.
White victims also show a high frequency but with a more balanced sex distribution compared to other ethnic groups. Asian and Pacific Islander groups generally exhibit lower crime counts, as indicated by the lighter shades.
The gender distribution also shows that males are disproportionately affected across most ethnic groups, especially in the most impacted categories.
#Test for Independence
contingency_Sex <- table(la_crime$Victim_Sex, la_crime$Crime_Description)
contingency_descent <- table(la_crime$Victim_Descent, la_crime$Crime_Description)
chi_squared_result <- chisq.test(contingency_Sex)
print(chi_squared_result)
##
## Pearson's Chi-squared test
##
## data: contingency_Sex
## X-squared = 1e+05, df = 363, p-value <2e-16
chi_squared_result1 <- chisq.test(contingency_descent)
print(chi_squared_result1)
##
## Pearson's Chi-squared test
##
## data: contingency_descent
## X-squared = 95725, df = 2299, p-value <2e-16
There is a statistically significant association between the two categorical variables for both tests conducted between Victim Sex, Descent and Crime. Hence we can accept the alternate hypothesis that they are dependent on each other.
*** END OF QUESTION 5 ***